﻿using Abot.Crawler;
using Abot.Poco;
using Newtonsoft.Json;
using System;

namespace FC.Spider
{
    /// <summary>
    /// Abstract base class for web spiders built on the Abot crawler.
    /// Derive from this class and implement
    /// <see cref="crawler_ProcessPageCrawlCompletedAsync"/> to process each crawled page;
    /// override the other virtual members to customize crawl decisions and event handling.
    /// </summary>
    public abstract class Spider
    {
        /// <summary>
        /// Default target URL, crawled when <see cref="Start"/> is called without an argument.
        /// </summary>
        public string BaseUrl { get; set; }

        /// <summary>
        /// Starts the crawler and blocks until the crawl completes.
        /// </summary>
        /// <param name="url">URL to crawl; falls back to <see cref="BaseUrl"/> when null.</param>
        /// <returns>The result of the completed crawl.</returns>
        /// <exception cref="InvalidOperationException">
        /// Thrown when both <paramref name="url"/> and <see cref="BaseUrl"/> are null or blank.
        /// </exception>
        public CrawlResult Start(string url = null)
        {
            // Guard: without this, new Uri(null) throws an unhelpful ArgumentNullException
            // from deep inside System.Uri.
            var target = url ?? BaseUrl;
            if (string.IsNullOrWhiteSpace(target))
            {
                throw new InvalidOperationException(
                    "No URL to crawl: pass a url argument or set BaseUrl first.");
            }

            var crawler = ConfiguredWebCrawler();
            return crawler.Crawl(new Uri(target));
        }

        /// <summary>
        /// Builds and configures the crawler instance used by <see cref="Start"/>.
        /// Override to supply a differently configured crawler.
        /// </summary>
        /// <returns>A fully configured <see cref="IWebCrawler"/> with events wired up.</returns>
        protected virtual IWebCrawler ConfiguredWebCrawler()
        {
            // Crawl configuration; see Abot's CrawlConfiguration for the full option list.
            var config = new CrawlConfiguration
            {
                // Per-crawl timeout in seconds; 0 = no timeout.
                CrawlTimeoutSeconds = 0,
                // Content types the crawler is allowed to download.
                DownloadableContentTypes = "text/html, text/plain",
                // Do not crawl pages on external domains.
                IsExternalPageCrawlingEnabled = false,
                // Do not follow links found on external pages.
                IsExternalPageLinksCrawlingEnabled = false,
                // Ignore robots.txt.
                IsRespectRobotsDotTextEnabled = false,
                // Do not re-crawl URIs that were already crawled. Abot keeps the set of
                // crawled URIs in memory, so leave this false for large crawls.
                IsUriRecrawlingEnabled = false,
                // Maximum number of concurrent request threads; sized to the machine,
                // since too many can overwhelm the target server.
                MaxConcurrentThreads = Environment.ProcessorCount,
                // Stop after this many pages; 0 = unlimited.
                MaxPagesToCrawl = 1000,
                // Per-domain page limit; 0 = unlimited.
                MaxPagesToCrawlPerDomain = 0,
                // Minimum delay between requests to the same domain, in milliseconds,
                // to avoid hammering the target.
                MinCrawlDelayPerDomainMilliSeconds = 1000
            };

            // Create the crawler; null arguments select Abot's default implementations
            // for the scheduler, throttler, hyperlink parser, etc.
            var crawler = new PoliteWebCrawler(config, null, null, null, null, null, null, null, null);

            // Decision delegates: whether to crawl a page, follow its links, or download its content.
            crawler.ShouldCrawlPage(ShouldCrawlPage);
            crawler.ShouldCrawlPageLinks(ShouldCrawlPageLinks);
            crawler.ShouldDownloadPageContent(ShouldDownloadPageContent);

            // Lifecycle events: page crawl started / completed / disallowed.
            crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
            crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompletedAsync;
            crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
            crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

            return crawler;
        }

        /// <summary>
        /// Raised when a page is disallowed from being crawled.
        /// Default implementation logs the page URI to the console.
        /// </summary>
        /// <param name="sender">The crawler raising the event.</param>
        /// <param name="e">Event data identifying the disallowed page.</param>
        protected virtual void crawler_PageCrawlDisallowed(object sender, PageCrawlDisallowedArgs e)
        {
            PageToCrawl pageToCrawl = e.PageToCrawl;
            Console.WriteLine($"页面不允许爬取:{pageToCrawl.Uri.AbsoluteUri}");
        }

        /// <summary>
        /// Raised when a page's links are disallowed from being crawled. No-op by default.
        /// </summary>
        /// <param name="sender">The crawler raising the event.</param>
        /// <param name="e">Event data identifying the page whose links were disallowed.</param>
        protected virtual void crawler_PageLinksCrawlDisallowed(object sender, PageLinksCrawlDisallowedArgs e) { }

        /// <summary>
        /// Raised when the crawl of a single page starts. No-op by default.
        /// </summary>
        /// <param name="sender">The crawler raising the event.</param>
        /// <param name="e">Event data identifying the page about to be crawled.</param>
        protected virtual void crawler_ProcessPageCrawlStarting(object sender, PageCrawlStartingArgs e) { }

        /// <summary>
        /// Raised when the crawl of a single page completes.
        /// Derived classes must implement this to process the crawled page.
        /// </summary>
        /// <param name="sender">The crawler raising the event.</param>
        /// <param name="e">Event data carrying the crawled page and its content.</param>
        protected abstract void crawler_ProcessPageCrawlCompletedAsync(object sender, PageCrawlCompletedArgs e);

        /// <summary>
        /// Decides whether a page should be crawled. Default: allow every page.
        /// </summary>
        /// <param name="pageToCrawl">The candidate page.</param>
        /// <param name="context">The current crawl context.</param>
        /// <returns>A decision allowing the crawl.</returns>
        protected virtual CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext context)
        {
            return new CrawlDecision { Allow = true };
        }

        /// <summary>
        /// Decides whether a page's content should be downloaded. Default: allow every page.
        /// </summary>
        /// <param name="pageToCrawl">The candidate page.</param>
        /// <param name="crawlContext">The current crawl context.</param>
        /// <returns>A decision allowing the download.</returns>
        protected virtual CrawlDecision ShouldDownloadPageContent(PageToCrawl pageToCrawl, CrawlContext crawlContext)
        {
            return new CrawlDecision { Allow = true };
        }

        /// <summary>
        /// Decides whether the links on a crawled page should be followed.
        /// Default: do not follow links.
        /// </summary>
        /// <param name="crawledPage">The page whose links are being considered.</param>
        /// <param name="crawlContext">The current crawl context.</param>
        /// <returns>A decision denying link crawling.</returns>
        protected virtual CrawlDecision ShouldCrawlPageLinks(CrawledPage crawledPage, CrawlContext crawlContext)
        {
            return new CrawlDecision { Allow = false, Reason = "不要爬取页面链接" };
        }
    }
}
